/**********************************************************************/
/*
   Author: Robbie Dulin, Michelle Han
   Created: 5 May 2021
   Last Updated: Apr 2024
   Description: Cleans raw August 2018 SAKERNAS data.

   This cleaning file should output 2 different datasets:
   1. Cleaned SAKERNAS data:
   sak_aug18_deid_clean
   2. Subset of SAKERNAS person-batch data matched with PMO for SAKERNAS analysis:
  sak_aug18_deid_clean_merged

*/
/**********************************************************************/
* Load the project filepath globals unless a master do-file has already done so
if "$master_run" != "1" include "./Do/SET_FILEPATHS.do"

* Start a date-stamped text log, closing any log that is still open
capture log close
global prefix: display %tdCYND td(`c(current_date)')
log using "$KP_logs/${prefix}_clean_SAKERNAS_aug2018.txt", text replace

* Session settings
set more off
clear

* Read the raw de-identified SAKERNAS file and print the number of observations
  use "$KP_deid_sakernas/Raw/SAKERNAS_PRAKERJA_18AUG_deid.dta", clear
  display _N

/*----------------------------------------------------*/
              /* Section: Demographics */
/*----------------------------------------------------*/

* NOTE: this survey carries no household identifier

* Household size (number of HH members)
  generate hh_size = b2_r1

* Birth year
  generate year_dob = b4_k7_th

* Age, plus a banded copy (values matched by no rule are copied unchanged by recode)
  summarize b4_k8
  generate age = b4_k8
  recode age (0/20   = 1) ///
             (21/40  = 2) ///
             (41/60  = 3) ///
             (61/80  = 4) ///
             (81/100 = 5), generate(age_cat)

* Indicator for age <= 30
* (a missing age is not <= 30 in Stata, so it is coded 0, as in a 0-then-replace build)
	generate young = age <= 30
	label define young 0 "Over 30 Years Old" 1 "30 and Under"
	label values young young

* Relationship to the household head, with value labels attached
  tabulate b4_k3, missing
  generate relation_hh_head = b4_k3
  label define relations ///
    1 "HH head" ///
    2 "Spouse" ///
    3 "Son/Daughter" ///
    4 "Step/adopted child" ///
    5 "Son/Daughter-in-law" ///
    6 "Grandchild" ///
    7 "Parent/Parent-in-law" ///
    8 "Other family" ///
    9 "Housemaid" ///
    10 "Driver/Gardener" ///
    11 "No relation", replace
  label values relation_hh_head relations
  tabulate relation_hh_head

* Marital status: one 0/1 indicator per b4_k10 code used below
  tabulate b4_k10, missing
  generate married  = (b4_k10 == 2)
  generate divorced = (b4_k10 == 3)
  generate widowed  = (b4_k10 == 4)
  generate single   = (b4_k10 == 1)

* Gender: sex is 1 for b4_k6 == 1 (labelled male); female is 1 for b4_k6 == 2
  generate sex = (b4_k6 == 1)
  label define sex 0 "female" 1 "male"
  label values sex sex
  generate female = (b4_k6 == 2)
  tabulate b4_k6, missing

* Currently a student (b4_k9 == 2)
  tabulate b4_k9, missing
  generate current_student = (b4_k9 == 2)

* Education level indicators built from the b5_r1a code
* (inrange() returns 0 for a missing code, same as the two-sided comparison)
  tabulate b5_r1a, missing
  generate no_elementary = (b5_r1a == 1)
  generate elementary    = inrange(b5_r1a, 2, 4)
  generate junior_high   = inrange(b5_r1a, 5, 7)
  generate high_school   = inrange(b5_r1a, 8, 11)
  generate tertiary      = inrange(b5_r1a, 12, 16)
  summarize no_elementary elementary junior_high high_school tertiary

* Indicator for "educated" (copy of the high_school indicator)
* NOTE(review): the value label reads "12 or More Years of Schooling", yet respondents
* with tertiary == 1 are coded 0 here -- confirm whether tertiary should count as educated.
  generate educated = high_school
  label define educated 0 "Less than 12 Years Schooling" 1 "12 or More Years of Schooling"
  label values educated educated

* DH: coding aligned with the au20 file
* Years of schooling assigned from the education code; codes not listed stay missing
	generate educ = b5_r1a
	generate school_years = .
	replace school_years = 3  if educ == 1                  // no elementary
	replace school_years = 6  if inlist(educ, 2, 3, 4)      // elementary: SD / MI / SDLB / Package A
	replace school_years = 9  if inlist(educ, 5, 6, 7)      // junior high: SMP / MTs / SMPLB / Package B
	replace school_years = 12 if inlist(educ, 8, 9, 10, 11) // high school (8-11): SMA / MA / SMLB / Paket C and vocational school / MAK
	replace school_years = 14 if inlist(educ, 12, 13)       // Diploma I/II/III
	replace school_years = 16 if inlist(educ, 14)           // Diploma IV and S1
	replace school_years = 18 if inlist(educ, 15, 16)       // S2/S3

* Training: holds a certificate (b5_r1d == 1) / currently in training (b5_r1f == 1)
  tabulate b5_r1d, missing
  generate train_certif = (b5_r1d == 1)

  tabulate b5_r1f, missing
  generate current_training = (b5_r1f == 1)

* Does the respondent live in the kabupaten of birth?
  generate born_kab = b5_r2b
  generate born_live_same = (kode_kab == born_kab)

* City indicator: non-missing kabupaten code of 70 or above
  generate city_dummy = !mi(kode_kab) & kode_kab >= 70

* Migration: 1 unless both the province code equals b5_r3a and the kabupaten code equals b5_r3b
  tabulate b5_r3a, missing
  tabulate b5_r3b, missing
  generate migrated = !(kode_prov == b5_r3a & kode_kab == b5_r3b)
  tabulate migrated, missing

* Disabilities
* For each item: a 0/1 "disabled" indicator for the two codes tested below,
* plus a raw copy of the answer. Items a/c/e test codes 2-3, items b/d/f test codes 5-6.
  tabulate b5_r4a, missing
  generate vision_disabled = inlist(b5_r4a, 2, 3)
  generate vision_disability = b5_r4a

  tabulate b5_r4b, missing
  generate hearing_disabled = inlist(b5_r4b, 5, 6)
  generate hearing_disability = b5_r4b

  tabulate b5_r4c, missing
  generate walk_disabled = inlist(b5_r4c, 2, 3)
  generate walk_disability = b5_r4c

  tabulate b5_r4d, missing
  generate hand_disabled = inlist(b5_r4d, 5, 6)
  generate hand_disability = b5_r4d

  tabulate b5_r4e, missing
  generate speech_disabled = inlist(b5_r4e, 2, 3)
  generate speech_disability = b5_r4e

  tabulate b5_r4f, missing
  generate other_disabled = inlist(b5_r4f, 5, 6)
  generate other_disability = b5_r4f

  summarize *disabled

  * at least one of the six 0/1 indicators is switched on
  generate any_disability = vision_disabled | hearing_disabled | walk_disabled | hand_disabled | speech_disabled | other_disabled
  tabulate any_disability, missing

  * any item at the higher of its two codes (3 for a/c/e, 6 for b/d/f)
  generate severe_disability = inlist(3, b5_r4a, b5_r4c, b5_r4e) | inlist(6, b5_r4b, b5_r4d, b5_r4f)
  tabulate severe_disability


* Urban-rural: urban is 1 when klasifikas == 1
  tabulate klasifikas, missing
  generate urban = (klasifikas == 1)


/*----------------------------------------------------*/
                /* Section: Employment */
/*----------------------------------------------------*/

* Employed (survey definition): any of the four screening questions answered 1
  tabulate b5_r5a1, missing   // worked an uninterrupted hour last week
  tabulate b5_r6, missing     // usually works an uninterrupted hour, but temporarily not at work
  tabulate b5_r7a, missing    // worked a cumulative hour but not an uninterrupted hour
  tabulate b5_r7b, missing    // usually works a cumulative hour, but temporarily not at work
  generate employed = (b5_r5a1 == 1) | (b5_r6 == 1) | (b5_r7a == 1) | (b5_r7b == 1)
  tabulate employed, missing

* Employment status
  tabulate b5_r27a, missing
  // anyone not employed under the survey definition must carry status 0
  assert b5_r27a == 0 if employed == 0
  generate employment_status = b5_r27a
  label define status ///
    0 "Not Working" ///
    1 "Self-employed" ///
    2 "Business owner with temporary/unpaid workers" ///
    3 "Business owner with paid workers" ///
    4 "Employee" ///
    5 "Temporary worker in agriculture" ///
    6 "Temporary worker (non-agriculture)" ///
    7 "Family/unpaid worker", replace
  label values employment_status status
  tabulate employment_status, missing

  * Type of work: permanent job, self-employed, business owner, etc. (12A)
  generate self_employed      = (employment_status == 1)
  generate business_owner     = (employment_status == 2 | employment_status == 3)
  generate self_emp_bus_owner = (self_employed | business_owner)
  generate perm_worker        = (employment_status == 4)
  generate temp_worker        = (employment_status == 5 | employment_status == 6)
  generate family_unpaid      = (employment_status == 7)

* Were you employed last week?
* From here on employed means status 1-6, so status 7 (family/unpaid) and 0 count as not employed
  replace employed = inlist(employment_status, 1, 2, 3, 4, 5, 6)
  tabulate employed, missing
  tabulate employed employment_status, missing row

  * Job-search channels. Note: 1 or 3 = yes, 2 or 4 = no, 0 = not asked
  generate reg_job_mkt     = (b5_r19a1 == 1)
  generate contact_company = (b5_r19a2 == 3)
  generate advert_online   = (b5_r19a4 == 3)
  generate search_network  = (b5_r19a5 == 1)
  generate any_attempt     = (b5_r19a6 == 3)

* Business field
* Sector of the job, kept only for respondents with employed == 1 (missing otherwise).
  // 17-sector classification: (https://www.bps.go.id/statictable/2016/01/06/1898/klasifikasi-17-sektor-tabel-input-output-indonesia-2010.html)
  // old code: (https://sirusa.bps.go.id/sirusa/index.php/variabel/85)
  tab b5_r23_sek, m
  tab b5_r23, m
  // value labels for the 17 sectors (label 10 previously misspelled "Infomation")
  la def business_fields 1 "Agriculture, Forestry and Fisheries" 2 "Mining and excavation" 3 "Industrial Production" 4 "Electricity and Gas" ///
    5 "Water Supply, Waste Management, and Recycling" 6 "Construction" 7 "Wholesale and Retail Trade, Car and Motorcycle Repair" ///
    8 "Transportation and Warehousing" 9 "Hospitality and Restaurants" 10 "Information and Communication" 11 "Financial Services and Insurance" ///
    12 "Real Estate" 13 "Business Services" 14 "Government Administration, Defense and Social Security" 15 "Education" 16 "Health and Social Services" 17 "Other", replace
  gen business_field = b5_r23_sek if employed == 1
  la val business_field business_fields
  tab business_field, m


* KBJI code (https://www.bps.go.id/website/fileMenu/KBJI-2014.pdf)
* Occupation group, kept only for respondents with employed == 1 (missing otherwise).
  // 1982 KJI single digit
  tab b5_r24_kji, m

  // KBJI 2014 single digit
  tab b5_r24_kbj, m
  la def occupations 0 "Army and Police" 1 "Manager" 2 "Professional" 3 "Technicians and Assistant Professionals" 4 "Administrative Personnel" 5 "Business Services and Sales Personnel" 6 "Skilled Workers in Agriculture, Forestry and Fisheries" 7 "Production, Craft, and Related Workers" 8 "Machine Operators and Assembly Workers" 9 "Blue-collar workers", replace
  gen occupation = b5_r24_kbj if employed == 1
  la val occupation occupations
  tab occupation, m

* wage last month
* NOTE(review): this header used to read "assuming this is January". The file is the
* August 2018 round, so confirm which month the income questions refer to; the hourly
* wage code later in this file converts monthly income to daily by dividing by 31.
  // b5_r31b1 / b5_r31b2: inspected for statuses 1, 5, 6 and for everyone else
  summ b5_r31b1 b5_r31b2 if inlist(employment_status, 1, 5, 6)
  summ b5_r31b1 b5_r31b2 if !inlist(employment_status, 1, 5, 6)
  tab b5_r31b1 if !inlist(employment_status, 1, 5, 6)
  tab b5_r31b2 if !inlist(employment_status, 1, 5, 6)

  // b5_r31c1 / b5_r31c2: inspected for status 4 (employees) and for everyone else.
  // The two tabs now use the same "not status 4" filter as the summ above; they
  // previously reused the 1/5/6 filter from the b-series block (copy-paste slip).
  summ b5_r31c1 b5_r31c2 if inlist(employment_status, 4)
  summ b5_r31c1 b5_r31c2 if !inlist(employment_status, 4)
  tab b5_r31c1 if !inlist(employment_status, 4)
  tab b5_r31c2 if !inlist(employment_status, 4)
  count if b5_r31c2 != 0 & b5_r31b2 != 0
  // NOTE: there are a number of respondents to these questions who should not have been asked it according to the survey flow
  // seems like info was just entered under wrong question
  // some observations appear to duplicate the income, will delete the duplicate
  count if b5_r31c1 != 0 & b5_r31b1 != 0
  list employment_status b5_r31b1 b5_r31b2 b5_r31c1 b5_r31c2 if b5_r31c1 != 0 & b5_r31b1 != 0
  // when both b1 and c1 are filled, keep c1 and zero out b1 so the sum below does not double count
  replace b5_r31b1 = 0 if b5_r31c1 != 0 & b5_r31b1 != 0

* Earnings: sum of the four income items, filled only for statuses 1, 4, 5, 6
* NOTE(review): this section was headed "including business ownership", but statuses
* 2 and 3 (business owners) are not in the if-condition and so get missing earnings -- confirm intent.
  gen earnings = b5_r31b1 + b5_r31b2 + b5_r31c1 + b5_r31c2 if inlist(employment_status, 1, 4, 5, 6)
  summ earnings if inlist(employment_status, 1, 4, 5, 6), d
  count if mi(earnings)

* Individual earnings, including zero if not employed
  gen ind_earnings = earnings
  replace ind_earnings = 0 if employed == 0
  sum ind_earnings, d

  * Winsorised copy: percentiles come from the strictly positive values (the sum below);
  * values above p99 are set to p99, non-zero values below p1 are set to p1.
  * Both replace lines read r() left behind by that one summarize call.
  gen ind_adj_earnings = ind_earnings
  sum ind_adj_earnings if ind_adj_earnings > 0, d
  replace ind_adj_earnings = r(p99) if ind_adj_earnings > r(p99) & !mi(ind_adj_earnings)
  replace ind_adj_earnings = r(p1) if ind_adj_earnings < r(p1) & !mi(ind_adj_earnings) & ind_adj_earnings != 0
  sum ind_adj_earnings, d
  // positive-only version (note: missing values also satisfy "> 0" in Stata and stay missing)
  gen ind_pos_adj_earnings = ind_adj_earnings if ind_adj_earnings > 0

  // 0/1 indicator for positive earnings, left missing when earnings are missing
  gen ind_earnings_pos = ind_earnings > 0 if !mi(ind_earnings)

* hours worked last week
  // summary restricted to employment statuses 1, 4, 5, 6
  summ b5_r26a if inlist(employment_status, 1, 4, 5, 6)
  // weekly maximum from the summ above, expressed per day
  di `r(max)'/7
  // max is 14 hours per day
  hist b5_r26a
  gen hours_worked = b5_r26a
  summ hours_worked

* hourly wage (income per day / hours per day)
* monthly income is divided by 31 days, weekly hours by 7 days
* raw wages and hours, excluding business owners
  gen hourly_wage_raw = (earnings / 31) / (hours_worked / 7)

* Hourly wage based on the winsorised individual earnings (ind_adj_earnings)
* Set to 0 when no hours were worked
* NOTE(review): this was described as "including business owners", but ind_adj_earnings
* derives from earnings, which is only filled for statuses 1, 4, 5, 6 -- business owners
* (2, 3) appear to end up missing here unless hours_worked == 0. Confirm intent.
  gen hourly_wage = (ind_adj_earnings/31)/(hours_worked / 7) if hours_worked != 0
  replace hourly_wage = 0 if hours_worked == 0
  gen poswage = hourly_wage if hourly_wage > 0

* Trim 1% and 99% percentiles
* percentiles come from the strictly positive wages; both replace lines reuse r() from that sum
  gen hourly_adj_wage = hourly_wage
  sum hourly_adj_wage if hourly_adj_wage > 0, d
  replace hourly_adj_wage = r(p99) if hourly_adj_wage > r(p99) & !mi(hourly_adj_wage)
  replace hourly_adj_wage = r(p1) if hourly_adj_wage < r(p1) & !mi(hourly_adj_wage) & hourly_adj_wage != 0
  sum hourly_adj_wage, d

* Positive wages
  gen poswage_adj = hourly_adj_wage if hourly_adj_wage > 0
  // 0/1 indicator, left missing when the trimmed wage is missing
  gen poswage_dummy = hourly_adj_wage > 0 if !mi(hourly_adj_wage)

* Replace wages as missing if no hours are worked
  gen hourly_wage_no0 = hourly_wage
  replace hourly_wage_no0 = . if hours_worked == 0

* Positive wages
  gen poswage_no0 = hourly_wage_no0 if hourly_wage_no0 > 0

* Trim at the 99th percentile
* NOTE(review): unlike hourly_adj_wage above, no 1st-percentile floor is applied to this
* series even though the section was headed "Trim 1% and 99%" -- confirm this is intended.
  gen hourly_adj_wage_no0 = hourly_wage_no0
  sum hourly_adj_wage_no0 if hourly_adj_wage_no0 > 0, d
  replace hourly_adj_wage_no0 = r(p99) if hourly_adj_wage_no0 > r(p99) & !mi(hourly_adj_wage_no0)
  sum hourly_adj_wage_no0, d
  gen poswage_adj_no0 = hourly_adj_wage_no0 if hourly_adj_wage_no0 > 0



  // Exchange rate: 14,403.60 rp/$
  // median monthly salary (assuming 8 hours/day, 5 days/week, 4 weeks/month)
  // r(p50) is the median left behind by the last "sum hourly_adj_wage_no0, d" above
  di `r(p50)' * 4 * 5 * 8
  di `r(p50)' * 4 * 5 * 8 / 14400

* Work tenure
  tabulate b5_r25a1, missing
  tabulate b5_r25a2i, missing
  tabulate b5_r25a2ii, missing

  * Temporary flags: is the "12 months or less" answer non-zero / is the "years" answer non-zero
  generate tenure_year_or_less = (b5_r25a1 != 0)
  generate tenure_year_more    = (b5_r25a2i != 0)
  tabulate tenure_year_or_less tenure_year_more

  * Months component for tenure above one year: 99 (assumed "don't know") becomes mid-year (6)
  generate b5_r25a2ii_no99 = cond(b5_r25a2ii == 99, 6, b5_r25a2ii)

  * Tenure in months
  // start from the months answer for those with tenure under one year
  generate tenure_months = b5_r25a1 if tenure_year_or_less == 1
  summarize tenure_months

  // then overwrite with 12 * years + months for those with tenure over one year
  replace tenure_months = (12 * b5_r25a2i) + b5_r25a2ii_no99 if tenure_year_more == 1
  summarize tenure_months
  // whatever is still missing is set to zero months
  count if tenure_months == .
  replace tenure_months = 0 if tenure_months == .
  drop tenure_year_or_less tenure_year_more b5_r25a2ii_no99
  summarize tenure_months, detail

  * Sanity check against age
  // age in months, minus 48, i.e. roughly the months lived since turning 4
  generate age_months = 12 * age
  replace age_months = age_months - 48
  summarize age_months
  // nobody should report holding the same job since before they were 4 years old
  assert tenure_months < age_months if tenure_months != .

* use internet at work
  tab b5_r28a, m
  // 1 when b5_r28a == 1; respondents with employed == 0 are set to 0 (not to missing)
  gen use_internet_work = b5_r28a == 1
  replace use_internet_work = 0 if employed == 0
  tab use_internet_work

* had more than one job
  tab b5_r40a, m
  tab b5_r40b, m
  * if worked one consecutive or cumulative hour at another job
  gen multiple_jobs = b5_r40a == 1 | b5_r40b == 1
  tab multiple_jobs, m

* leave job in past year
  tab b5_r51, m
  gen leave_job = b5_r51 == 1

* job search/business creation
  tab b5_r15a, m
  gen looking_job = b5_r15a == 1

  // 1 when any of the six job-search items equals 1
  // NOTE(review): elsewhere in this file "yes" is coded 3 for b5_r19a2, b5_r19a4 and
  // b5_r19a6 ("1 or 3 = yes"), so testing only for the value 1 may miss those items.
  // Confirm; changing this alters the data and the recorded datasignature.
  gen activities_job = inlist(1, b5_r19a1, b5_r19a2, b5_r19a3, b5_r19a4, b5_r19a5, b5_r19a6)
  label var activities_job "Any effort to find a job"

  tab b5_r15b, m
  gen prep_new_bus = b5_r15b == 1
 * bys hh_id_s: egen prep_bus_hh = max(prep_new_bus)

* labor force
* in the labor force if employed, looking for a job, or preparing a new business
  tab employed, m
  gen labor_force = employed == 1 | looking_job == 1 | prep_new_bus == 1
  tab labor_force, m

******************************************************
* Flag repeated anon_id4 values (the tag is 0 for ids that occur exactly once)
	duplicates tag anon_id4, generate(anon_id4_dupe)
	tabulate anon_id4_dupe

* Keep rows whose id is unique or missing; every copy of a repeated non-missing id is removed
	keep if anon_id4_dupe == 0 | missing(anon_id4)
	drop anon_id4_dupe

********************************************************************************
*			Export SAKERNAS Aug '18 cleaned data				       *
********************************************************************************

  * Final tidy-up before saving: rename the weight and drop merge1/merge2
  * (presumably merge indicators left over from earlier processing -- confirm)
  rename final_weig weight
  drop merge1 merge2

  * Reproducibility guard: save only when the data signature equals the recorded one.
  * Any change to values, variable names or variable order above requires updating the string.
  datasignature
  if "`r(datasignature)'" == "508426:281(106964):3534274313:1605150139" {
    save "$KP_deid_sakernas/Clean/sak_aug18_deid_clean.dta", replace
  }
  else {
    di as err "Careful, your machine produces a different dataset"
    // abort with a real Stata return code (459: "something that should be true of
    // your data is not") instead of relying on the undefined command `stop`
    error 459
  }
  

********************************************************************************
*			Merge with admin data			       *
********************************************************************************

  * Drop rows where both userid_bps and anon_id4 are missing
  drop if mi(userid_bps) & mi(anon_id4)

  * Constant marker for this SAKERNAS round
  gen sak_round = 1

  * Variables carried forward, grouped by theme (text after /// is a comment).
  * NOTE(review): poswage_* does not match the variable named exactly "poswage",
  * so that one is dropped here -- confirm this is intended.
  keep anon_id4 userid_bps bpsonly sak_round /// identifiers and round marker
    hh_size year_dob age age_cat relation_hh_head /// household and age
    married divorced widowed single sex female current_student /// marital status, gender, student
    no_elementary elementary junior_high high_school tertiary educ school_years /// education
    train_certif current_training /// training
    born_* city_dummy migrated /// birthplace, city, migration
    vision_* hearing_* walk_* hand_* speech_* other_* /// disabilities
    employed self_employed business_owner self_emp_bus_owner /// employment type
    perm_worker temp_worker family_unpaid business_field occupation /// employment type (cont.)
    ind_* hours_worked hourly_* poswage_* /// earnings, hours, wages
    tenure_months age_months use_internet_work multiple_jobs leave_job /// job characteristics
    looking_job activities_job prep_new_bus labor_force /// job search and labor force
    reg_job_mkt contact_company advert_online search_network any_attempt


* merge
* Save the SAKERNAS rows that carry an anon_id4 to a tempfile, then attach them to the
* PMO admin file. No preserve/restore is needed: the data in memory are replaced by the
* `use ..., clear` immediately afterwards.
  keep if !missing(anon_id4)
  tempfile sak_anon_id
  save `sak_anon_id'

  use "$KP_deid_admin/Clean/pmo_b1-22_clean_long_deid.dta", clear

  // m:1 needs anon_id4 to be unique in the SAKERNAS tempfile (repeated ids are dropped
  // earlier in this file); keep(3) retains only rows matched in both files
  merge m:1 anon_id4 using `sak_anon_id', nogen keep(3)

  compress

  * Reproducibility guard: save only when the data signature equals the recorded one
  datasignature
  if "`r(datasignature)'" == "111447:147(43899):3985367117:3136695850" {
    save "$KP_deid_sakernas/Clean/sak_aug18_deid_clean_merged.dta", replace
  }
  else {
    di as err "Careful, your machine produces a different dataset"
    // abort with a real Stata return code (459: "something that should be true of
    // your data is not") instead of relying on the undefined command `stop`
    error 459
  }

  // done: close the log opened at the top of this file
  log close
